# Data wrangling Library
library(tidyverse)
library(dplyr)
# Visualize data
library(ggplot2)
library(inspectdf)
library(GGally)
# Naive Bayes
library(e1071)
# Splitting Data
library(rsample)
# Random Forest
library(randomForest)
# Smote for unbalanced data
library(DMwR)
# ROCR
library(ROCR)
# Confussion Matrix
library(caret)
telemark <- read_csv2("data/bank-full.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
## age = col_double(),
## job = col_character(),
## marital = col_character(),
## education = col_character(),
## default = col_character(),
## balance = col_double(),
## housing = col_character(),
## loan = col_character(),
## contact = col_character(),
## day = col_double(),
## month = col_character(),
## duration = col_double(),
## campaign = col_double(),
## pdays = col_double(),
## previous = col_double(),
## poutcome = col_character(),
## y = col_character()
## )
glimpse(telemark)
## Observations: 45,211
## Variables: 17
## $ age <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57,…
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", …
## $ marital <chr> "married", "single", "married", "married", "single", "marri…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown",…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no"…
## $ balance <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 7…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ day <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "ma…
## $ duration <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ pdays <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,…
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no",…
Input variables:
1. age: age (numeric)
2. job : type of job (categorical: “admin.”,“unknown”,“unemployed”,“management”,“housemaid”,“entrepreneur”,“student”, “blue-collar”,“self-employed”,“retired”,“technician”,“services”)
3. marital : marital status (categorical: “married”,“divorced”,“single”; note: “divorced” means divorced or widowed)
4. education : education (categorical: “unknown”,“secondary”,“primary”,“tertiary”)
5. default: has credit in default? (binary: “yes”,“no”)
6. balance: average yearly balance, in euros (numeric)
7. housing: has housing loan? (binary: “yes”,“no”)
8. loan: has personal loan? (binary: “yes”,“no”)
9. contact: contact communication type (categorical: “unknown”,“telephone”,“cellular”)
10. day: last contact day of the month (numeric)
11. month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”)
12. duration: last contact duration, in seconds (numeric)
13. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15. previous: number of contacts performed before this campaign and for this client (numeric)
16. poutcome: outcome of the previous marketing campaign (categorical: “unknown”,“other”,“failure”,“success”)
17. y: has the client subscribed a term deposit? (binary: “yes”,“no”)
table(is.na(telemark))
##
## FALSE
## 768587
telemark <- telemark %>%
mutate(job = as.factor(job),
marital = as.factor(marital),
education = as.factor(education),
default = as.factor(default),
housing = as.factor(housing),
loan = as.factor(loan),
contact = as.factor(contact),
month = as.factor(month),
poutcome = as.factor(poutcome),
subscribe = as.factor(y)) %>%
select(-c(y))
summary(telemark)
## age job marital education
## Min. :18.00 blue-collar:9732 divorced: 5207 primary : 6851
## 1st Qu.:33.00 management :9458 married :27214 secondary:23202
## Median :39.00 technician :7597 single :12790 tertiary :13301
## Mean :40.94 admin. :5171 unknown : 1857
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6835
## default balance housing loan contact
## no :44396 Min. : -8019 no :20081 no :37967 cellular :29285
## yes: 815 1st Qu.: 72 yes:25130 yes: 7244 telephone: 2906
## Median : 448 unknown :13020
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
##
## day month duration campaign
## Min. : 1.00 may :13766 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 6895 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 6247 Median : 180.0 Median : 2.000
## Mean :15.81 jun : 5341 Mean : 258.2 Mean : 2.764
## 3rd Qu.:21.00 nov : 3970 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2932 Max. :4918.0 Max. :63.000
## (Other): 6060
## pdays previous poutcome subscribe
## Min. : -1.0 Min. : 0.0000 failure: 4901 no :39922
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1840 yes: 5289
## Median : -1.0 Median : 0.0000 success: 1511
## Mean : 40.2 Mean : 0.5803 unknown:36959
## 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :871.0 Max. :275.0000
##
show_plot(inspect_cor(subset(telemark, select = -c(subscribe))))
ggcorr(telemark, label = T)
## Warning in ggcorr(telemark, label = T): data in column(s) 'job', 'marital',
## 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome',
## 'subscribe' are not numeric and were ignored
numericCols <- unlist(lapply(telemark, is.numeric))
show_plot(inspect_num(telemark[,numericCols]))
prop.table(table(telemark$subscribe))
##
## no yes
## 0.8830152 0.1169848
set.seed(1)
split <- initial_split(data = telemark, prop = 0.8, strata = subscribe)
telemark_train <- training(split)
telemark_test <- testing(split)
prop.table(table(telemark_train$subscribe))
##
## no yes
## 0.8832426 0.1167574
# telemark_train_upsample <- upSample(x = telemark_train[, -17], y = telemark_train$subscribe, yname = "subscribe")
telemark_train_upsample <- SMOTE(subscribe ~ ., as.data.frame(telemark_train), perc.over = 100, perc.under = 200)
prop.table(table(telemark_train_upsample$subscribe))
##
## no yes
## 0.5 0.5
model_naive <- naiveBayes(subscribe ~ ., data = telemark_train_upsample)
naive_prediction <- predict(model_naive, telemark_test)
naive_prediction_raw <- as.data.frame(predict(model_naive, telemark_test, type = "raw"))
naive_prediction_raw <- naive_prediction_raw %>%
mutate(no = round(no,4),
yes = round(yes,4))
naive_matrix <- confusionMatrix(naive_prediction, telemark_test$subscribe)
table <- as.table(naive_matrix)
table <- as.data.frame(table)
table %>% ggplot(aes(x = Prediction, y = Reference, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), fontface = "bold", color = "white") +
theme_minimal() +
theme(legend.position = "none")
matrix_1 <- as.data.frame(t(as.matrix(naive_matrix, what = "overall")))
matrix_2 <- as.data.frame(t(as.matrix(naive_matrix, what = "classes")))
matrix <- cbind(matrix_1, matrix_2)
matrix %>% select(Accuracy, Sensitivity, Specificity, "Pos Pred Value") %>%
t()
## [,1]
## Accuracy 0.7614466
## Sensitivity 0.7550150
## Specificity 0.8095685
## Pos Pred Value 0.9673896
# ROC
naive_roc <- data.frame(prediction = naive_prediction_raw[,2],
trueclass = as.numeric(telemark_test$subscribe=="yes"))
head(naive_roc)
## prediction trueclass
## 1 0.1772 0
## 2 0.4306 0
## 3 0.1510 0
## 4 0.1432 0
## 5 0.0785 0
## 6 0.0646 0
naive_roc <- prediction(naive_roc$prediction, naive_roc$trueclass)
# ROC curve
plot(performance(naive_roc, "tpr", "fpr"),
main = "ROC")
abline(a = 0, b = 1)
# AUC
auc_ROCR_n <- performance(naive_roc, measure = "auc")
auc_ROCR_n <- auc_ROCR_n@y.values[[1]]
auc_ROCR_n
## [1] 0.8456519
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# model tuning - metrics function
metrics <- function(cutoff, prob, ref, postarget, negtarget)
{
predict <- as.factor(ifelse(prob >= cutoff, postarget, negtarget))
conf <- caret::confusionMatrix(predict , ref, positive = postarget)
acc <- conf$overall[1]
rec <- conf$byClass[1]
prec <- conf$byClass[3]
spec <- conf$byClass[2]
mat <- t(as.matrix(c(rec , acc , prec, spec)))
colnames(mat) <- c("recall", "accuracy", "precicion", "specificity")
return(mat)
}
co <- seq(0.01,0.99,length=100)
result <- matrix(0,100,4)
# apply function metrics
for(i in 1:100){
result[i,] = metrics(cutoff = co[i],
prob = naive_prediction_raw$yes,
ref = as.factor(ifelse(telemark_test$subscribe == "yes", 1, 0)),
postarget = "1",
negtarget = "0")
}
# visualize
ggplotly(tibble("Recall" = result[,1],
"Accuracy" = result[,2],
"Precision" = result[,3],
"Specificity" = result[,4],
"Cutoff" = co) %>%
gather(key = "Metrics", value = "value", 1:4) %>%
ggplot(aes(x = Cutoff, y = value, col = Metrics)) +
geom_line(lwd = 1.5) +
scale_color_manual(values = c("darkred","darkgreen","orange", "blue")) +
scale_y_continuous(breaks = seq(0,1,0.1), limits = c(0,1)) +
scale_x_continuous(breaks = seq(0,1,0.1)) +
labs(title = "Tradeoff Model Perfomance") +
theme_minimal() +
theme(legend.position = "top",
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank()))